In [ ]:
import re # Regular Expressions
import pandas as pd # DataFrames & Manipulation
import nltk.data # Sentence tokenizer
from bs4 import BeautifulSoup # HTML processing
from gensim.models.doc2vec import LabeledSentence, Doc2Vec
In [ ]:
# Location of the bz2-compressed, tab-separated recipe dump.
train_input = "../data/recipes.tsv.bz2"
# keep_default_na=False keeps empty strings as "" instead of turning them into NaN
# (http://pandas-docs.github.io/pandas-docs-travis/io.html#na-values).
# quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
train = pd.read_csv(
    train_input,
    sep="\t",
    quoting=3,
    encoding="utf-8",
    keep_default_na=False,
)
In [ ]:
# Fetch the Punkt models, then load the pre-trained sentence tokenizer
# for the German language.
nltk.download('punkt')
tokenizer = nltk.data.load("tokenizers/punkt/german.pickle")
In [ ]:
def normalize(text):
    """
    Turn a piece of (possibly HTML) text into a list of lower-case words.

    The markup is stripped with BeautifulSoup, every character that is not
    a letter (a-z, A-Z, German umlauts, sharp s) is replaced by a blank, and
    the remainder is lower-cased and split on whitespace.
    """
    # Joining the text nodes with a blank keeps words apart that were only
    # separated by a tag in the original markup.
    soup = BeautifulSoup(text, 'html.parser')
    stripped = " ".join(soup.strings)
    # Anything outside the letter set becomes a word separator.
    letters = re.sub(u"[^a-zA-ZäöüÄÖÜß]", " ", stripped)
    return letters.lower().split()
def split_sentences(text):
    """
    Split text into sentences and normalize each sentence.

    Returns a list with one word list per sentence. Sentences that contain
    no words after normalization are dropped. A real list is returned on
    both Python 2 and Python 3 (the built-in filter() is lazy on Python 3).
    """
    cleaned = []
    for sentence in tokenizer.tokenize(text):
        words = normalize(sentence)
        if words:  # skip sentences that normalize to an empty word list
            cleaned.append(words)
    return cleaned
In [ ]:
# Build the training corpus: one labelled document per recipe, tagged with the
# recipe's row number. `train` is read without an index column, so the row
# number is also the index label used when looking recipes up again.
instructions = train['instructions']  # fetch the column once, not per iteration
size = instructions.size
sentences = []
for i, text in enumerate(instructions):
    if (i + 1) % 10000 == 0:
        # Progress report every 10000 recipes; print(...) with a single
        # argument behaves the same on Python 2 and Python 3.
        print("Processing %d of %d recipies." % (i + 1, size))
    # either keep complete text or split into sentences but label all parts with the same ID
    sentences.append(LabeledSentence(normalize(text), [i]))
    #sentences += [LabeledSentence(words, [i]) for words in split_sentences(text)]
In [ ]:
# Report the corpus size; the parenthesised form runs on Python 2 and Python 3.
print("Total: %d sentences.\n" % len(sentences))
In [ ]:
# Hyper-parameters handed to the Doc2Vec constructor in the training cell.
num_features = 300    # dimensionality of the vectors (passed as `size`)
min_word_count = 40   # minimum word count (passed as `min_count`)
num_workers = 4       # number of threads to run in parallel (passed as `workers`)
context = 10          # context window size (passed as `window`)
downsampling = 1e-3   # downsample setting for frequent words (passed as `sample`)
In [ ]:
# Configure the built-in logging module at INFO level with timestamped
# messages, so that progress reported through logging during Doc2Vec
# training becomes visible.
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
In [ ]:
# Build the Doc2Vec model from the labelled recipe documents using the
# hyper-parameters defined earlier. print(...) with a single argument works on
# Python 2 and Python 3; no backslash continuations are needed inside the
# parentheses of the call.
print("Training model...")
model = Doc2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
In [ ]:
# Ask the model for the documents most similar to the document tagged 1.
# The result is a list of pairs; the first element of each pair is kept as
# the document id (presumably the second is the similarity score).
vec = model.docvecs.most_similar(1)
ids = [doc_id for doc_id, _ in vec]
ids
In [ ]:
# Display the raw pairs returned by the most_similar query for document 1.
vec
In [ ]:
# Show the rows of `train` whose index labels are the ids collected in `ids`.
train.loc[ids]
In [ ]:
# Query the trained model for the entries most similar to 'pasta'
# (presumably word-level neighbours; confirm against the gensim version in use).
model.most_similar('pasta')